suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tinytable))
suppressPackageStartupMessages(library(rairtable))
suppressPackageStartupMessages(library(lme4))
suppressPackageStartupMessages(library(broom))
sample_metadata <- read_tsv(paste0("data/metadata.tsv")) %>%
mutate(Extraction=factor(Extraction, levels=c("ZYMO","DREX","EHEX"))) %>%
rename(dataset=Dataset)
extraction_data <- airtable("tblBcTZcRG1E9wsGO", "appQpr6MxnaiVHsHy") %>% #get base ID from Airtable browser URL
read_airtable(., fields = c("EX DNA ng","Datasets_flat"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
rename(id=1,extract=2,dataset=3) %>%
filter(dataset %in% sample_metadata$dataset) %>%
select(dataset,extract)
library_data <- airtable("tblo6AuYpxbbGw9gh", "appQpr6MxnaiVHsHy") %>% #get base ID from Airtable browser URL
read_airtable(., fields = c("Required PCR cycles","Datasets_flat"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
rename(id=1,pcr=2,dataset=3) %>%
filter(dataset %in% sample_metadata$dataset) %>%
select(dataset,pcr)
indexing_data <- airtable("tblhfsiR4NI9XJQG0", "appQpr6MxnaiVHsHy") %>% #get base ID from Airtable browser URL
read_airtable(., fields = c("Adaptors (nM)","Library (nM)","Datasets_flat"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
rename(id=1,adaptors=2,library=3,dataset=4) %>%
filter(dataset %in% sample_metadata$dataset) %>%
mutate(ratio=library/(adaptors+library)) %>%
select(dataset,adaptors,library,ratio)
preprocessing_data <- airtable("tblJfLRU2FIVz37Y1", "appQpr6MxnaiVHsHy") %>% #get base ID from Airtable browser URL
read_airtable(., fields = c("bases_pre_fastp","bases_post_fastp","host_bases","metagenomic_bases","singlem_fraction","C","EHI_plaintext"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
rename(id=1,nonpareil=C,dataset=EHI_plaintext,microbial_fraction=singlem_fraction) %>%
filter(dataset %in% sample_metadata$dataset) %>%
select(dataset,bases_pre_fastp,bases_post_fastp,bases_post_fastp,metagenomic_bases,microbial_fraction,nonpareil)
assembly_data <- airtable("tblG6ZIvkYN844I97", "appQpr6MxnaiVHsHy") %>% #get base ID from Airtable browser URL
read_airtable(., fields = c("EHI_number_api","assembly_length","N50","L50","num_contigs","num_bins","metagenomic_bases"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
rename(id=1,assembly_mapped_bases=metagenomic_bases,dataset=EHI_number_api) %>%
filter(dataset %in% sample_metadata$dataset) %>%
select(dataset,assembly_length,N50,L50,num_contigs,num_bins,assembly_mapped_bases)
mapping_data <- airtable("tblWDyQmM9rQ9wq57", "appWbHBNLE6iAsMRV") %>% #get base ID from Airtable browser URL
read_airtable(., fields = c("EHI_sample_static","MAG_mapping_percentage"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
rename(id=1,dataset=EHI_sample_static) %>%
filter(dataset %in% sample_metadata$dataset) %>%
select(dataset,MAG_mapping_percentage)
all_data <- sample_metadata %>%
mutate(Taxon=factor(Taxon,levels=c("Amphibian","Bird","Mammal","Reptile","Control"))) %>%
left_join(extraction_data, by=join_by(dataset==dataset)) %>%
left_join(library_data, by=join_by(dataset==dataset)) %>%
left_join(indexing_data, by=join_by(dataset==dataset)) %>%
left_join(preprocessing_data, by=join_by(dataset==dataset)) %>%
left_join(assembly_data, by=join_by(dataset==dataset)) %>%
left_join(mapping_data, by=join_by(dataset==dataset))
Total amount of DNA extracted from the 150 ul subset of the bead-beaten sample.
all_data %>%
group_by(Taxon,Extraction) %>%
summarise(value = sprintf("%.0f±%.0f", mean(extract), sd(extract))) %>%
pivot_wider(names_from = Extraction, values_from = value) %>%
tt(caption = "Mean and standard deviation of total DNA nanograms")
| Taxon | ZYMO | DREX | EHEX |
|---|---|---|---|
| Amphibian | 297±350 | 340±475 | 1138±1195 |
| Bird | 71±134 | 11±11 | 44±45 |
| Mammal | 448±432 | 256±224 | 867±730 |
| Reptile | 102±71 | 162±134 | 250±179 |
| Control | 0±0 | 2±3 | 1±0 |
all_data %>%
ggplot(aes(x=Extraction,y=extract))+
geom_boxplot() +
facet_grid(. ~ Taxon, scales = "free") +
labs(y="DNA yield (ng)",x="Extraction method")
all_data %>%
filter(Taxon != "Control") %>%
lmerTest::lmer(extract ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()
| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 229.412625000 | 129.0917 | 1.7771296 | 18.62337 | 0.091883533 |
| fixed | NA | ExtractionDREX | -37.448812500 | 100.5929 | -0.3722808 | 59.99998 | 0.710995523 |
| fixed | NA | ExtractionEHEX | 345.333000000 | 100.5929 | 3.4329752 | 59.99998 | 0.001087593 |
| ran_pars | Sample | sd__(Intercept) | 0.003841839 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 373.178642666 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 348.464099979 | NA | NA | NA | NA |
Number of PCR cycles required for reaching the plateau phase of the indexing PCR.
all_data %>%
group_by(Taxon,Extraction) %>%
summarise(value = sprintf("%.1f±%.1f", mean(pcr), sd(pcr))) %>%
pivot_wider(names_from = Extraction, values_from = value) %>%
tt(caption = "Mean and standard deviation of optimal number of PCR cycles")
| Taxon | ZYMO | DREX | EHEX |
|---|---|---|---|
| Amphibian | 14.2±2.1 | 10.8±3.3 | 10.2±2.6 |
| Bird | 18.0±2.5 | 18.5±3.9 | 14.8±2.6 |
| Mammal | 11.0±4.0 | 10.2±1.9 | 10.3±2.2 |
| Reptile | 11.7±4.2 | 10.3±3.7 | 12.0±3.5 |
| Control | 20.0±2.8 | 19.8±0.5 | 22.0±4.2 |
all_data %>%
ggplot(aes(x=Extraction,y=pcr))+
geom_boxplot() +
facet_grid(. ~ Taxon, scales = "free") +
labs(y="Optimal number of PCR cycles",x="Extraction method")
all_data %>%
filter(Taxon != "Control") %>%
lmerTest::lmer(pcr ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()
| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 13.708333 | 0.9696194 | 14.137850 | 22.50455 | 1.117002e-12 |
| fixed | NA | ExtractionDREX | -1.250000 | 0.8896915 | -1.404981 | 60.00000 | 1.651832e-01 |
| fixed | NA | ExtractionEHEX | -1.875000 | 0.8896915 | -2.107472 | 60.00000 | 3.926431e-02 |
| ran_pars | Sample | sd__(Intercept) | 0.000000 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 2.555902 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 3.081982 | NA | NA | NA | NA |
Nonpareil estimate of the metagenomic complexity after removing host DNA.
all_data %>%
group_by(Taxon,Extraction) %>%
summarise(value = sprintf("%.1f±%.1f", mean(nonpareil), sd(nonpareil))) %>%
pivot_wider(names_from = Extraction, values_from = value) %>%
tt(caption = "Mean and standard deviation of extimated metagenomic completeness")
| Taxon | ZYMO | DREX | EHEX |
|---|---|---|---|
| Amphibian | 0.9±0.1 | 0.8±0.1 | 0.8±0.1 |
| Bird | 0.9±0.1 | 1.0±0.0 | 0.8±0.4 |
| Mammal | 0.8±0.2 | 0.8±0.1 | 0.7±0.3 |
| Reptile | 0.9±0.1 | 0.9±0.0 | 0.9±0.1 |
| Control | 0.0±0.0 | 0.5±0.6 | 0.5±0.7 |
all_data %>%
ggplot(aes(x=Extraction,y=nonpareil))+
geom_boxplot() +
facet_grid(. ~ Taxon, scales = "free") +
labs(y="Nonpareil completeness",x="Extraction method")
all_data %>%
filter(Taxon != "Control") %>%
lmerTest::lmer(nonpareil ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()
| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 0.885552853 | 0.03450229 | 25.6664948 | 40.89542 | 7.753071e-27 |
| fixed | NA | ExtractionDREX | 0.005536892 | 0.04336611 | 0.1276779 | 48.00020 | 8.989373e-01 |
| fixed | NA | ExtractionEHEX | -0.112193866 | 0.04336611 | -2.5871326 | 48.00020 | 1.276294e-02 |
| ran_pars | Sample | sd__(Intercept) | 0.059565487 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 0.035030813 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 0.150224598 | NA | NA | NA | NA |
alpha_data <- list.files(path = "results", pattern = "^alpha_.*\\.tsv$", full.names = TRUE) %>%
map_df(~ read_tsv(.)) %>%
left_join(sample_metadata,by= join_by(dataset==dataset))
alpha_data %>%
pivot_longer(!c(dataset,Library,Species,Taxon,Sample,Extraction), names_to = "metric", values_to = "value") %>%
mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic"))) %>%
ggplot(aes(x=Extraction,y=value))+
geom_boxplot() +
facet_grid(metric ~ Taxon, scales = "free")
alpha_data %>%
lmerTest::lmer(richness ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()
| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 58.43750 | 12.008400 | 4.8663854 | 8.87726 | 0.0009237868 |
| fixed | NA | ExtractionDREX | 2.18750 | 4.698926 | 0.4655319 | 32.00001 | 0.6447035294 |
| fixed | NA | ExtractionEHEX | 2.43750 | 4.698926 | 0.5187355 | 32.00001 | 0.6075141409 |
| ran_pars | Sample | sd__(Intercept) | 15.62413 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 30.71216 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 13.29057 | NA | NA | NA | NA |
alpha_data %>%
lmerTest::lmer(neutral ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()
| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 29.71528470 | 5.751141 | 5.16685008 | 8.745982 | 0.0006458923 |
| fixed | NA | ExtractionDREX | -1.59472963 | 2.085900 | -0.76452835 | 31.999998 | 0.4501538272 |
| fixed | NA | ExtractionEHEX | -0.04935512 | 2.085900 | -0.02366131 | 31.999998 | 0.9812697035 |
| ran_pars | Sample | sd__(Intercept) | 9.00571703 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 14.37531297 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 5.89981588 | NA | NA | NA | NA |
alpha_data %>%
lmerTest::lmer(phylogenetic ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
broom.mixed::tidy() %>%
tt()
| effect | group | term | estimate | std.error | statistic | df | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 4.5870074 | 0.4577012 | 10.021838 | 8.589824 | 4.977993e-06 |
| fixed | NA | ExtractionDREX | 0.2122362 | 0.1485377 | 1.428837 | 32.000064 | 1.627406e-01 |
| fixed | NA | ExtractionEHEX | 0.2230529 | 0.1485377 | 1.501658 | 32.000064 | 1.429897e-01 |
| ran_pars | Sample | sd__(Intercept) | 1.1278928 | NA | NA | NA | NA |
| ran_pars | Species | sd__(Intercept) | 0.9754991 | NA | NA | NA | NA |
| ran_pars | Residual | sd__Observation | 0.4201282 | NA | NA | NA | NA |
variance_data <- list.files(path = "results", pattern = "^var_.*\\.tsv$", full.names = TRUE) %>%
map_df(~ read_tsv(.))
variance_data %>%
left_join(sample_metadata %>% select(Species,Taxon) %>% unique(),by=join_by(species==Species)) %>%
mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic"))) %>%
ggplot(aes(x=Taxon,y=r2)) +
geom_boxplot()+
ylim(0,1)+
facet_grid(. ~ metric, scales = "free")
variance_data %>%
group_by(metric) %>%
summarise(mean=mean(r2),sd=sd(r2)) %>%
tt()
| metric | mean | sd |
|---|---|---|
| neutral | 0.06201935 | 0.04744380 |
| phylogenetic | 0.07381277 | 0.06884687 |
| richness | 0.16970818 | 0.06626388 |